
* ------------------------------------------------------------------ *
* Set-up: project root, configuration, log file and optional arguments
* Positional arguments:
*   1      project root directory (stored in global root_dir)
*   2 - 5  optional overrides for weight_category, weight_versions,
*          weight_window and wtype; the literal ___EMPTY___ means
*          "argument not supplied"
* ------------------------------------------------------------------ *
global root_dir = "`1'"

include "$root_dir/code/config/config.do"


* Close a log named "dat" that an interrupted earlier run may have left open;
* otherwise -log using- below would fail and the failure would be swallowed.
cap log close dat
* Path is quoted so that a log_dir containing spaces still works.
cap noi log using "${log_dir}/8_count_by_LZ_naics_mapping.log", replace name(dat)

*Handle empty arguments: map the ___EMPTY___ placeholder to an empty string
global arg1 = cond("`2'" == "___EMPTY___", "", "`2'")
global arg2 = cond("`3'" == "___EMPTY___", "", "`3'")
global arg3 = cond("`4'" == "___EMPTY___", "", "`4'")
global arg4 = cond("`5'" == "___EMPTY___", "", "`5'")

* Each global below is overwritten only when its argument was supplied.
* NOTE(review): presumably config.do provides the defaults otherwise - confirm.
if "$arg1" != "" {
    global weight_category "$arg1"
    di "Weight category: ${weight_category}"
}

if "$arg2" != "" {
    global weight_versions "$arg2"
    di "Weight versions: ${weight_versions}"
}

if "$arg3" != "" {
    global weight_window "$arg3"
    di "Weight window: ${weight_window}"
}

if "$arg4" != "" {
	global wtype "$arg4"
}
* Always echo the weight type in effect, whether or not it was overridden
di "${wtype}"
capture noi {

/* v2 This do-file repeats the previous step with a different mapping from IPC4 into the industry of manufacturing. 
Namely, the IPC4-NAICS1997 mapping from Lybbert and Zolas (2014). We refer to this as the "LZ" approach. 

Inputs:
- IPC4_NAICS1997.dta (by LZ (2014))
- US_docdb_cipc.dta
- industry-crosswalk-90-00-02-07-12.xls 
- ind90.dta

Output:
- patents_ind6090_lz_ipc4.dta
*/

* ------------------------------------------- *
* A. Count patents by NAICS 1997 industry code 
* ------------------------------------------- *
* Steps: (1) store the IPC4 -> NAICS 1997 crosswalk in a tempfile,
*        (2) build fractional patent counts by ipc4 and application year,
*        (3) spread them over NAICS codes with the crosswalk's probability_weight,
*        (4) sum by naics97_6 and application year and store in a tempfile.
*
* crosswalk 
*Lybbert, Travis J., and Nikolas J. Zolas. 2014. "Getting Patents and Economic Data to Speak to Each Other: 
*An 'Algorithmic Links with Probabilities' Approach for Joint Analyses of Patenting and Economic Activity." Research Policy 43 (3): 530-542. accessed January 18th 2019
* https://drive.google.com/file/d/16YDRz7G8IaKGA1T7g32GP3ajrNBySbZE/view
* -clear- prevents a failure when data are already in memory; the path is
* quoted so that directories containing spaces work.
insheet using "${alm_data_raw}/concordances/lz/ipc4_to_naics97_6.txt", names clear
tempfile IPC4_NAICS1997
save "`IPC4_NAICS1997'", replace

use "${alm_data_proc}/US_docdb_cipc.dta", clear

* IPC4 = first four characters of the classification code
gen ipc4 = substr(cipc6, 1, 4)
* Remove the IPC4 classes listed below
* NOTE(review): presumably these have no counterpart in the crosswalk - confirm
drop if inlist(ipc4,"A99Z","B68C","B68F","B82T","B99Z","C12J","C13D","C13F","C13G")
drop if inlist(ipc4,"E02C","E99Z","F21H","F99Z","G10B","G10F","G21J","G99Z","H04T")
drop if inlist(ipc4,"H99Z")

* Each patent family is split equally over the rows it occupies
duplicates tag docdb_family_id, generate(dupl_docdb)
gen fraction_docdb = 1/(dupl_docdb + 1)

* tot starts at 1 so that, after the multiplication in the loop, it equals the
* row's fractional weight (initialising it to fraction_docdb would apply the
* weight twice).
gen tot = 1
local indicators tot auto90 auto95 pauto90 pauto90_rm6 pauto95 in_relevant_field
local counts
foreach tech of local indicators {
	replace `tech' = fraction_docdb * `tech'
	gen `tech'_b = `tech' * bia
	* collect "<var> <var>_b" pairs for the collapse / re-weighting steps below
	local counts `counts' `tech' `tech'_b
}
keep if inrange(appln_year, 1970, 2004)
collapse (sum) `counts', by(ipc4 appln_year)

* mmerge is a community-contributed command (not part of official Stata).
* Only ipc4 values present on both sides are kept.
mmerge ipc4 using `IPC4_NAICS1997', unmatched(both)
keep if _merge == 3 
drop _merge
* Allocate each IPC4 count to NAICS codes using the crosswalk weight
foreach tech of local counts {
	replace `tech' = probability_weight * `tech'
}
collapse (sum) `counts', by(naics97_6 appln_year)
sort naics97_6 appln_year
tempfile patents_naics97
save "`patents_naics97'", replace

* ------------------------------------------- *
* B. Map to ind 6090
* ------------------------------------------- *

* Import the NAICS 1997 to 1990 Census concordance table 
* U.S. Census Bureau. n.d. "Industry and Occupation Code Lists & Crosswalks." Accessed March 2023. https://www.census.gov/topics/employment/industry-occupation/guidance/code-lists.html.
* Paths are quoted so that directories containing spaces work.
import excel "${alm_data_raw}/concordances/industry-crosswalk-90-00-02-07-12.xls", sheet("Sheet1") firstrow clear
* Numeric copy of the Census column becomes ind90
destring Census, generate(ind90)
drop Census 
* Keep only rows that carry both an ind90 code and a NAICS label
drop if ind90 ==.
drop if NAICS6digit ==""
* NAICSagg is the key later used to join the patent counts to this table
rename NAICS6digit NAICSagg
tempfile NAICS1997_1990Census
save "`NAICS1997_1990Census'", replace

use "`patents_naics97'", clear
* Recode naics97_6 into the NAICSagg labels of the NAICS 1997 to 1990 Census
* concordance table so that the two datasets can be joined on NAICSagg.
* Rules run from the widest code range to the narrowest: a narrower rule
* therefore overrides the "exc." label of the wider range that contains it.
gen NAICSagg = ""

* --- ranges spanning 10,000 codes ---
replace NAICSagg = "230000" if naics97_6 >= 230000 & naics97_6 < 240000

* --- ranges spanning 1,000 codes ---
local span1000 111000 112000 114000 115000 211000 213000 337000
foreach base of local span1000 {
	replace NAICSagg = "`base'" if naics97_6 >= `base' & naics97_6 < `base' + 1000
}
replace NAICSagg = "314000 exc. 314110" if naics97_6 >= 314000 & naics97_6 < 315000

* --- ranges spanning 100 codes ---
local span100 ///
	113100 113200 113300 ///
	212100 212200 212300 ///
	221100 221200 ///
	311100 311200 311300 311400 311500 311600 311700 311900 ///
	312100 312200 ///
	313100 313300 ///
	315100 315200 315900 ///
	316200 316900 ///
	321100 321200 ///
	322100 ///
	323100 ///
	325100 325200 325300 325400 325500 325600 325900 ///
	326100 ///
	327200 327300 327400 327900 ///
	331100 331200 331300 331400 331500 ///
	332100 332200 332300 332400 332500 332600 332700 332800 ///
	333200 333400 333500 333600 333900 ///
	334100 334200 334300 334400 334600 ///
	335100 335200 335300 335900 ///
	336100 336200 336300 336500 336600 336900 ///
	339100
foreach base of local span100 {
	replace NAICSagg = "`base'" if naics97_6 >= `base' & naics97_6 < `base' + 100
}
* 100-code ranges whose label excludes codes that are re-assigned further down
replace NAICSagg = "311800 exc. 311811" if naics97_6 >= 311800 & naics97_6 < 311900
replace NAICSagg = "313200 exc. 313240" if naics97_6 >= 313200 & naics97_6 < 313300
replace NAICSagg = "321900 exc. 321991,321992" if naics97_6 >= 321900 & naics97_6 < 322000
replace NAICSagg = "332900 exc. 332992,332993,332994,332995" if naics97_6 >= 332900 & naics97_6 < 333000
replace NAICSagg = "333300 exc. 333315" if naics97_6 >= 333300 & naics97_6 < 333400
replace NAICSagg = "334500 exc. 334518" if naics97_6 >= 334500 & naics97_6 < 334600
replace NAICSagg = "339900 exc. 339920,339930" if naics97_6 >= 339900 & naics97_6 < 340000

* --- ranges spanning 10 codes ---
local span10 ///
	221310 221320 221330 ///
	313240 314110 316110 ///
	322210 322220 322230 322290 ///
	324110 324120 324190 ///
	326210 326220 326290 ///
	327110 327120 ///
	333110 333120 333130 ///
	339920 339930
foreach base of local span10 {
	replace NAICSagg = "`base'" if naics97_6 >= `base' & naics97_6 < `base' + 10
}

* --- single six-digit codes ---
local single ///
	311811 321991 321992 ///
	332992 332993 332994 332995 ///
	333315 334518 ///
	336411 336412 336413 336414 336415 336419
foreach code of local single {
	replace NAICSagg = "`code'" if naics97_6 == `code'
}

* merge with the concordance table NAICS 1997 to 1990 Census
* Note: NAICSagg 213000 could not be matched - thus dropped
* (joinby keeps matched combinations only)
joinby NAICSagg using "`NAICS1997_1990Census'"

* Update the patent counts per ind90
* Divide NAICSagg 221100 & 221200 equally among the ind90 they map into 
* dup1 = number of joined rows sharing the same year / NAICSagg / naics97_6,
* i.e. the number of ind90 codes a NAICS row was matched to.
duplicates tag appln_year NAICSagg naics97_6, gen(dup1)
replace dup1 = dup1 + 1

* Every count variable kept in the final dataset must go through BOTH
* aggregation loops. pauto90_rm6 and pauto90_rm6_b are included here; without
* them the saved values would be the un-split, un-summed NAICS-level value of
* whichever row happens to survive -duplicates drop, force-.
local techvars auto90 auto95 pauto90 pauto90_rm6 pauto95 in_relevant_field ///
	in_relevant_field_b tot_b auto90_b auto95_b pauto90_b pauto90_rm6_b pauto95_b

foreach tech of local techvars { 
	* split the count over the matched ind90 codes, then sum within ind90-year
	replace `tech' = `tech'/dup1
	egen `tech'_ind90 = total(`tech'), by(appln_year ind90)
	drop `tech'
}
gen totalNAICS = tot/dup1 
egen total_ind90 = total(totalNAICS), by(appln_year ind90)
drop dup1 naics97_6 NAICSagg totalNAICS
* the *_ind90 totals are constant within ind90-year, so one row per cell suffices
duplicates drop appln_year ind90, force

* map ind90 into ind6090
*Autor, David H. "The Skill Content of Recent Technological Change: An Empirical Exploration" 
*Quarterly Journal of Economics, 118(4), November 2003, 1279-1334. Accessed November 2018. https://economics.mit.edu/people/faculty/david-h-autor/data-archive.
joinby ind90 using "${alm_data_raw}/concordances/ind90_sic/ind90.dta"
drop ind7090 ind8090 dind8090 dind7090 mind8090 mind7090

* update the patent counts per ind6090
foreach tech of local techvars { 
	egen `tech' = total(`tech'_ind90), by(appln_year ind6090)
}
duplicates drop appln_year ind6090, force
keep ind6090 appln_year auto90 auto90_b auto95 auto95_b pauto90 pauto90_b pauto90_rm6 pauto90_rm6_b pauto95 pauto95_b in_relevant_field in_relevant_field_b
sort ind6090 appln_year

*labeling
* -ds- lists every variable whose name ends in _b and returns the list in
* r(varlist); the macro is used directly rather than routed through a string
* expression.
ds *_b
foreach var in `r(varlist)' {
    label var `var' "corrected for duplicates"
}

* Path is quoted so that an output directory containing spaces works.
save "${alm_data_proc}/patents_ind6090_lz_ipc4.dta", replace


}
* Report whether the captured block above completed without error (_rc is the
* return code left by -capture-), then close the log opened at the start.
local outcome = cond(_rc == 0, "successfully", "with errors")
display "Execution finished `outcome'."

cap log close dat